CCBR-620

Maggie Cam

Oct 15, 2015

Background

To detect different gene expression in a small subpopulation (<1%) of skin, call Merkel cell, upon shh, fgf20, and edar mutation. Aim to analyze the connection between the three different pathways, and try to get clue of the genes participating in Merkel cell development.

E15.5 embryo skin from 2 mutant lines:

edar mutant = mutation in gene ectodysplasin-A receptor, Single point mutation (Mutation details: This allele involves a G to A transition mutation at nucleotide 1,135 that causes the amino acid change: glutamate to lysine at position 379 (E379K). (J:56496))
phenotype (http://www.informatics.jax.org/allele/allgenoviews/MGI:1856018) = abnormal coat/hair morphology, darkened coat color

fgf20 mutant = fgf20-β-galactosidase knock-in allele phenotype= no guard hair in adult mouse back skin

Data Processing

source("http://bioconductor.org/biocLite.R")
## Bioconductor version 3.2 (BiocInstaller 1.20.0), ?biocLite for help
#biocLite("edgeR")
library(Rsubread)
library(limma)
library(edgeR)
library(ggplot2)
library(rgl)
library(knitr)


knit_hooks$set(rgl = function(before, options, envir) {
  if (!before) {
    ## after a chunk has been evaluated
    if (rgl.cur() == 0) return()  # no active device
    name = paste(options$fig.path, options$label, sep = '')
    rgl.snapshot(paste(name, '.png', sep = ''), fmt = 'png')
    return(paste('\\includegraphics{', name, '}\n', sep = ''))
  }
})

knit_hooks$set(webgl = hook_webgl)

This part of the program was run on biowulf, and data transferred to a local directory

#Command line version
module load subread
x=$(ls *.bam)
featureCounts -p -T 8 -s 2 -p -t exon -g gene_id -a /data/maggiec/RNASeq/Genomes/mm10/gencode.vM4.all.gtf -o counts_ss.txt $x

#Used R version:
gtf="data/maggiec/RNASeq/Genomes/mm10/gencode.vM4.all.gtf"
targets <- readTargets()
fc <- featureCounts(files=targets$bam,isGTFAnnotationFile=TRUE,nthreads=32,
      annot.ext=gtf,GTF.attrType="gene_name",strandSpecific=2,isPairedEnd=TRUE)
x <- DGEList(counts=fc$counts, genes=fc$annotation)

Load data from local directory:

setwd("/Users/maggiec/GitHub/Maggie/ccbr620/")
#load("data/ssData.RData")
#fc=fc_ss
load("data/RNASeq_orig.RData")
mapped=read.delim(file = "data/mapped.txt",header = TRUE)
targets$mapped=mapped$mapped
ls()
## [1] "fc"      "gtf"     "mapped"  "targets"
targets
##               bam         Phenotype Batch    mapped
## 1  Sample_e10.bam    edar/+ control     1  76698538
## 2   Sample_e1.bam    edar/+ control     2  96350389
## 3   Sample_e2.bam    edar/+ control     2  98212381
## 4   Sample_e4.bam  edar/edar mutant     2 104174178
## 5   Sample_e5.bam  edar/edar mutant     2  97482428
## 6   Sample_e9.bam  edar/edar mutant     1  56689636
## 7   Sample_f1.bam fgf20lz/+ control     3  75223785
## 8   Sample_f2.bam fgf20lz/+ control     3  81814741
## 9   Sample_f4.bam fgf20lz/lz mutant     3  53524329
## 10  Sample_f5.bam fgf20lz/+ control     3  79315207
## 11  Sample_f6.bam fgf20lz/lz mutant     3  84455037
## 12  Sample_f7.bam fgf20lz/lz mutant     3  80703536
fc$stat
##                        Status Sample_e10.bam Sample_e1.bam Sample_e2.bam
## 1                    Assigned       30137063      38131187      38018334
## 2        Unassigned_Ambiguity         338418        423143        427446
## 3     Unassigned_MultiMapping        5540811       6724855       7144799
## 4       Unassigned_NoFeatures        2332980       2896013       3515618
## 5         Unassigned_Unmapped              0             0             0
## 6   Unassigned_MappingQuality              0             0             0
## 7  Unassigned_FragementLength              0             0             0
## 8          Unassigned_Chimera              0             0             0
## 9        Unassigned_Secondary              0             0             0
## 10     Unassigned_Nonjunction              0             0             0
## 11       Unassigned_Duplicate              0             0             0
##    Sample_e4.bam Sample_e5.bam Sample_e9.bam Sample_f1.bam Sample_f2.bam
## 1       39859361      37639209      21918421      29010380      31527017
## 2         464930        429850        251806        328766        361974
## 3        7824053       7377360       4328405       5049329       5805782
## 4        3938754       3294804       1846191       3223421       3212604
## 5              0             0             0             0             0
## 6              0             0             0             0             0
## 7              0             0             0             0             0
## 8              0             0             0             0             0
## 9              0             0             0             0             0
## 10             0             0             0             0             0
## 11             0             0             0             0             0
##    Sample_f4.bam Sample_f5.bam Sample_f6.bam Sample_f7.bam
## 1       20773860      30420667      32484170      31229089
## 2         226857        343845        360251        353216
## 3        3820429       5435442       5666179       5505908
## 4        1941022       3457653       3716923       3263560
## 5              0             0             0             0
## 6              0             0             0             0
## 7              0             0             0             0
## 8              0             0             0             0
## 9              0             0             0             0
## 10             0             0             0             0
## 11             0             0             0             0
fc1=mat=fc$counts
tfc1=t(fc1)
filter <- apply(fc1, 1, function(x) length(x[x>5])>=1)
fc1filt <- fc1[filter,]
genes <- rownames(fc1filt)

Mapping Rate:

options(scipen=9)
map=as.numeric(targets$mapped)
names(map)=sapply(strsplit(targets$bam,split="\\."),function(x) x[1])
barplot(map, col=as.numeric(as.factor(targets$Phenotype)),las = 2,cex.axis = 0.7)

QC Check: Look at raw signal distribution and median expression levels

library(ggplot2)
library(reshape)

fc1filt=as.data.frame(fc1filt)
fc1filtnames=sapply(strsplit(colnames(fc1filt),split="_"),function(x) x[2])
fc1filtnames=sapply(strsplit(fc1filtnames,split="\\."),function(x) x[1])
colnames(fc1filt)=paste(fc1filtnames,targets$Phenotype)

df.m <- melt(as.data.frame(fc1filt))
ggplot(df.m) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL) +
  theme(legend.position='right') + scale_x_log10() + ggtitle("Raw Counts")

par(mar=c(10,7,1,1))
boxplot(log(value)~variable,las=2,data=df.m,main="Raw Signal", 
    ylab="Counts",col=as.numeric(as.factor(targets$Phenotype)))

Data is normalized by TMM and quantile normalization

x <- DGEList(counts=fc$counts, genes=fc$annotation)
isexpr <- rowSums(cpm(x)>0.5) >= 2
hasannot <- rowSums(is.na(x$genes))==0
x <- x[isexpr & hasannot,keep.lib.sizes=FALSE]

#Number of filtered genes 
dim(x)
## [1] 16253    12
x <- calcNormFactors(x)

par(mfrow=c(2,6))
for (i in 1:dim(x)[2]){
plotMD(cpm(x, log=TRUE), column=i)
abline(h=0, col="red", lty=2, lwd=2)
}

Run Voom

#Do analysis for entire group
celltype <- factor(targets$Phenotype)
design <- model.matrix(~celltype)
design
##    (Intercept) celltypeedar/edar mutant celltypefgf20lz/+ control
## 1            1                        0                         0
## 2            1                        0                         0
## 3            1                        0                         0
## 4            1                        1                         0
## 5            1                        1                         0
## 6            1                        1                         0
## 7            1                        0                         1
## 8            1                        0                         1
## 9            1                        0                         0
## 10           1                        0                         1
## 11           1                        0                         0
## 12           1                        0                         0
##    celltypefgf20lz/lz mutant
## 1                          0
## 2                          0
## 3                          0
## 4                          0
## 5                          0
## 6                          0
## 7                          0
## 8                          0
## 9                          1
## 10                         0
## 11                         1
## 12                         1
## attr(,"assign")
## [1] 0 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$celltype
## [1] "contr.treatment"
v <- voom(x,design,plot=TRUE,normalize="quantile")

After Normalization

colnames(v$E)=paste(fc1filtnames,targets$Phenotype)
df.m <- melt(as.data.frame(v$E))
## Using  as id variables
ggplot(df.m) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL) +
  theme(legend.position='right') + ggtitle("Normalized Counts")

par(mar=c(10,7,1,1))
boxplot(value~variable,las=2,data=df.m,main="Normalized Signal", 
    ylab="Counts",col=as.numeric(as.factor(targets$Phenotype)))

edf=as.matrix(v$E)
tedf= t(edf)
pca=prcomp(tedf,scale.=T)
tedf1 = data.frame(tedf)
Phenotype=sapply(strsplit(targets$Phenotype,split=" "),function(x) x[1])
cell_rep=paste(Phenotype,targets$Batch,sep=".")
tedf1$group = as.factor(Phenotype)

plot(pca,type="lines")  #Decide how many PC's are relevant for plotting

pca$x[,1:3]  #look at first 3 PC's
##                               PC1       PC2         PC3
## e10 edar/+ control    -46.8250401 -40.90604   28.098641
## e1 edar/+ control      59.6977305 -83.09294  -55.749286
## e2 edar/+ control       1.1366238 -32.64203   67.385731
## e4 edar/edar mutant    77.3047336 -53.72389    1.141197
## e5 edar/edar mutant     8.3889563 -62.11561   39.384595
## e9 edar/edar mutant  -149.3487422 -35.83906  -24.546535
## f1 fgf20lz/+ control   19.5901540  50.61451    5.055829
## f2 fgf20lz/+ control  -68.5182693  52.18792    9.116811
## f4 fgf20lz/lz mutant   -3.1247422  25.74435 -107.277643
## f5 fgf20lz/+ control   35.5886477  68.42053    2.288900
## f6 fgf20lz/lz mutant   -0.5432889  71.26031   26.014322
## f7 fgf20lz/lz mutant   66.6532369  40.09195    9.087437
plot3d(pca$x[,1:3],col = as.integer(tedf1$group),type="s",size=2)
group.v<-as.vector(cell_rep)
text3d(pca$x, pca$y, pca$z, group.v, cex=1.0, adj = 1.2) 
#rgl.postscript("pca3d_indiv.pdf","pdf")

unnamed_chunk_7snapshot
You must enable Javascript to view this page properly.

Preprocessing of edar group (voom): 2 batches of pups (voom)

Design: model.matrix(~Batch+celltype)

targets[1:6,]
##              bam        Phenotype Batch    mapped
## 1 Sample_e10.bam   edar/+ control     1  76698538
## 2  Sample_e1.bam   edar/+ control     2  96350389
## 3  Sample_e2.bam   edar/+ control     2  98212381
## 4  Sample_e4.bam edar/edar mutant     2 104174178
## 5  Sample_e5.bam edar/edar mutant     2  97482428
## 6  Sample_e9.bam edar/edar mutant     1  56689636
celltype <- factor(targets$Phenotype[1:6])
Batch <- factor(targets$Batch[1:6])
cell_rep=paste(celltype,Batch,sep=".")
design1 <- model.matrix(~Batch+celltype)
design1
##   (Intercept) Batch2 celltypeedar/edar mutant
## 1           1      0                        0
## 2           1      1                        0
## 3           1      1                        0
## 4           1      1                        1
## 5           1      1                        1
## 6           1      0                        1
## attr(,"assign")
## [1] 0 1 2
## attr(,"contrasts")
## attr(,"contrasts")$Batch
## [1] "contr.treatment"
## 
## attr(,"contrasts")$celltype
## [1] "contr.treatment"
v1 <- voom(x[,1:6],design1,plot=TRUE,normalize="quantile")

edf=as.matrix(v$E[,1:6])
tedf= t(edf)
pca=prcomp(tedf,scale.=T)
tedf1 = data.frame(tedf)

Phenotype=sapply(strsplit(targets$Phenotype[1:6],split=" "),function(x) x[1])
Batch=targets$Batch[1:6]
tedf1$group = as.factor(Phenotype)
plot(pca,type="lines")  #Decide how many PC's are relevant for plotting

pca$x[,1:3]  #look at first 3 PC's
##                            PC1        PC2        PC3
## e10 edar/+ control   -39.58156 -36.729254  51.561576
## e1 edar/+ control     69.60465 -94.937753 -35.224601
## e2 edar/+ control      9.08715  21.257686  79.763216
## e4 edar/edar mutant   81.62089  63.120079 -46.958472
## e5 edar/edar mutant   18.99469  40.427968   4.626303
## e9 edar/edar mutant -139.72581   6.861275 -53.768022
plot3d(pca$x[,1:3],col = as.integer(tedf1$group),type="s",size=2)
group.v<-as.vector(paste(Phenotype,Batch))
text3d(pca$x, pca$y, pca$z, group.v, cex=1.0, adj = 1.2) 
#rgl.postscript("pca3d_indiv_edar_batch.pdf","pdf")

unnamed_chunk_8snapshot
You must enable Javascript to view this page properly.

Preprocessing of Fgf group (voom): 1 batch of pups

Design: model.matrix(~celltype)

#Process the Fgf group
targets[7:12,]
##              bam         Phenotype Batch   mapped
## 7  Sample_f1.bam fgf20lz/+ control     3 75223785
## 8  Sample_f2.bam fgf20lz/+ control     3 81814741
## 9  Sample_f4.bam fgf20lz/lz mutant     3 53524329
## 10 Sample_f5.bam fgf20lz/+ control     3 79315207
## 11 Sample_f6.bam fgf20lz/lz mutant     3 84455037
## 12 Sample_f7.bam fgf20lz/lz mutant     3 80703536
celltype <- factor(targets$Phenotype[7:12])
design2 <- model.matrix(~celltype)
design2
##   (Intercept) celltypefgf20lz/lz mutant
## 1           1                         0
## 2           1                         0
## 3           1                         1
## 4           1                         0
## 5           1                         1
## 6           1                         1
## attr(,"assign")
## [1] 0 1
## attr(,"contrasts")
## attr(,"contrasts")$celltype
## [1] "contr.treatment"
v2 <- voom(x[,7:12],design2,plot=TRUE,normalize="quantile")

#fgf:
edf=as.matrix(v$E[,7:12])
tedf= t(edf)
pca=prcomp(tedf,scale.=T)
tedf1 = data.frame(tedf)

Phenotype=sapply(strsplit(targets$Phenotype[7:12],split=" "),function(x) x[1])
tedf1$group = as.factor(Phenotype)
plot(pca,type="lines")  #Decide how many PC's are relevant for plotting

pca$x[,1:3]  #look at first 3 PC's
##                             PC1        PC2       PC3
## f1 fgf20lz/+ control -15.461277   15.64215 -14.89205
## f2 fgf20lz/+ control 126.034997   21.70746 -43.05940
## f4 fgf20lz/lz mutant   5.505962 -132.36504  17.91359
## f5 fgf20lz/+ control -37.291128   29.80851  59.56308
## f6 fgf20lz/lz mutant  11.728095   51.33417  54.30037
## f7 fgf20lz/lz mutant -90.516648   13.87275 -73.82559
plot3d(pca$x[,1:3],col = as.integer(tedf1$group),type="s",size=2)
group.v<-as.vector(Phenotype)
text3d(pca$x, pca$y, pca$z, group.v, cex=1.0, adj = 1.2) 
#rgl.postscript("pca3d_indiv_fgf.pdf","pdf")

unnamed_chunk_9snapshot
You must enable Javascript to view this page properly.

Genes of Interest

genes=c("Sox2","Atoh1","Shh","Gli1","Edar","Traf6","Tnfrsf19","Lta","Ltb","Nfkb1")
vdf=cbind(v1$E,v2$E)
vdf=vdf[,c(1,2,3,4,5,6,7,8,10,9,11,12)]
genes_count=vdf[rownames(vdf) %in% genes,]

for (i in 1:length(genes)){
gbar=vdf[rownames(vdf)==genes[i],]
if(length(gbar)>0){
names(gbar)=targets$Phenotype[match(names(gbar),targets$bam)]
names(gbar)= sapply(strsplit(names(gbar),split=" "),function(x) x[1])
groupcol=as.factor(names(gbar))
barplot(2^gbar,col=as.numeric(groupcol), las = 2,main=paste(genes[i]," Counts: Normalized",sep=""),ylab="normalized cpm")
}
}

####Statistical Analysis of Edar group (lmFit)

#Run analysis of edar group:
fit1 <- lmFit(v1,design1) 
fit1 <- eBayes(fit1) 
options(digits=3) 
topgenes1=topTable(fit1,coef=3,n=50,sort.by="p")
FC = 2^(fit1$coefficients[,3])
FC = ifelse(FC<1,-1/FC,FC)
finalres=topTable(fit1,coef=3,sort="none",n=Inf)

Volcano Plot: Edar Group

suppressPackageStartupMessages(library(googleVis))
library(googleVis)

op <- options(gvis.plot.tag='chart')
volcano_data=as.data.frame(cbind(fit1$coefficients[,3],-1*log10(fit1$p.value[,3])))
volcano_data$pop.html.tooltip=rownames(volcano_data)
yval=ceiling(max(fit1$coefficients[,3]))
Scatter <- gvisScatterChart(volcano_data, 
          options=list(
          tooltip="{isHtml:'True'}",
          legend='none',
          lineWidth=0, pointSize=1,
          title='Edar mut vs het', vAxis="{title:'Log Odds'}",
          hAxis="{title:'Log Fold Change'}", 
          width=1200, height=800,
          hAxes="[{viewWindowMode:'explicit', viewWindow:{min:-10, max:10}}]",
          vAxes="[{viewWindowMode:'explicit', viewWindow:{min:0, max:12}}]"))
plot(Scatter)

Simple Batch Removal Step

Based on the linear model:

Signal = Celltype + Batch + Residual

#Look up Batch Effect:
fit1$design
##   (Intercept) Batch2 celltypeedar/edar mutant
## 1           1      0                        0
## 2           1      1                        0
## 3           1      1                        0
## 4           1      1                        1
## 5           1      1                        1
## 6           1      0                        1
## attr(,"assign")
## [1] 0 1 2
## attr(,"contrasts")
## attr(,"contrasts")$Batch
## [1] "contr.treatment"
## 
## attr(,"contrasts")$celltype
## [1] "contr.treatment"
head(fit1$coefficients)
##              (Intercept)  Batch2 celltypeedar/edar mutant
## Xkr4                1.05  0.0100                   0.6673
## Sox17               2.89 -0.0827                   0.0601
## Mrpl15              5.75  0.1398                   0.0595
## RP23-34E15.5       -1.10  0.1745                   0.1878
## Lypla1              6.00 -0.2218                   0.2376
## Tcea1               6.33 -0.0728                   0.0421
targets
##               bam         Phenotype Batch    mapped
## 1  Sample_e10.bam    edar/+ control     1  76698538
## 2   Sample_e1.bam    edar/+ control     2  96350389
## 3   Sample_e2.bam    edar/+ control     2  98212381
## 4   Sample_e4.bam  edar/edar mutant     2 104174178
## 5   Sample_e5.bam  edar/edar mutant     2  97482428
## 6   Sample_e9.bam  edar/edar mutant     1  56689636
## 7   Sample_f1.bam fgf20lz/+ control     3  75223785
## 8   Sample_f2.bam fgf20lz/+ control     3  81814741
## 9   Sample_f4.bam fgf20lz/lz mutant     3  53524329
## 10  Sample_f5.bam fgf20lz/+ control     3  79315207
## 11  Sample_f6.bam fgf20lz/lz mutant     3  84455037
## 12  Sample_f7.bam fgf20lz/lz mutant     3  80703536
v1coeff = fit1$coefficients

#Remove batch effect from Batch2 
w1=v1
w1$E=v1$E
w1$E[,2:5]=v1$E[,2:5]-v1coeff[,2]

Heatmap: Edar Group Before and after Batch Removal

library(d3heatmap)
#Original Heatmap (notice batch effect)
topgenes_data=v1$E[rownames(v1$E) %in% rownames(topgenes1),]
topgenes_data=topgenes_data[match(rownames(topgenes1),rownames(topgenes_data)),]
colnames(topgenes_data)=targets$Phenotype[match(colnames(topgenes_data),targets$bam)]
colnames(topgenes_data)= sapply(strsplit(colnames(topgenes_data),split=" "),function(x) x[1])
colnames(topgenes_data)=paste(colnames(topgenes_data),targets$Batch[1:6])

d3heatmap(topgenes_data, scale = "row", dendrogram = "none", main = "title",
    color = "YlOrRd",cexRow=1,main="topgenes")

v1res=(residuals.MArrayLM(fit1,v1)) # calculate residuals
head(v1res)
##              Sample_e10.bam Sample_e1.bam Sample_e2.bam Sample_e4.bam
## Xkr4                 0.0964     -0.187085        0.1092       0.40636
## Sox17                0.0393     -0.080253        0.0478       0.00823
## Mrpl15              -0.0704      0.000611        0.0643      -0.01613
## RP23-34E15.5         0.1190      0.029394       -0.1242       0.36271
## Lypla1              -0.1017     -0.037356        0.1373      -0.06012
## Tcea1               -0.0426      0.071711       -0.0302       0.02089
##              Sample_e5.bam Sample_e9.bam
## Xkr4               -0.3751       -0.0897
## Sox17               0.0237       -0.0506
## Mrpl15             -0.0480        0.0790
## RP23-34E15.5       -0.2948       -0.1375
## Lypla1             -0.0359        0.1068
## Tcea1              -0.0625        0.0456
v1fit=(fitted.MArrayLM(fit1,design1)) # get fitted values
head(v1fit)
##                  1      2      3      4      5      6
## Xkr4          1.05  1.063  1.063  1.730  1.730  1.720
## Sox17         2.89  2.807  2.807  2.867  2.867  2.950
## Mrpl15        5.75  5.886  5.886  5.945  5.945  5.805
## RP23-34E15.5 -1.10 -0.929 -0.929 -0.741 -0.741 -0.915
## Lypla1        6.00  5.779  5.779  6.017  6.017  6.238
## Tcea1         6.33  6.261  6.261  6.303  6.303  6.375
head(v1$E) # original = fitted + residuals
##              Sample_e10.bam Sample_e1.bam Sample_e2.bam Sample_e4.bam
## Xkr4                  1.149         0.876          1.17         2.136
## Sox17                 2.929         2.726          2.85         2.875
## Mrpl15                5.675         5.886          5.95         5.929
## RP23-34E15.5         -0.984        -0.899         -1.05        -0.378
## Lypla1                5.899         5.742          5.92         5.957
## Tcea1                 6.291         6.332          6.23         6.324
##              Sample_e5.bam Sample_e9.bam
## Xkr4                  1.35          1.63
## Sox17                 2.89          2.90
## Mrpl15                5.90          5.88
## RP23-34E15.5         -1.04         -1.05
## Lypla1                5.98          6.35
## Tcea1                 6.24          6.42
topgenes_data=w1$E[rownames(w1$E) %in% rownames(topgenes1),]
topgenes_data=topgenes_data[match(rownames(topgenes1),rownames(topgenes_data)),]
colnames(topgenes_data)=targets$Phenotype[match(colnames(topgenes_data),targets$bam)]
colnames(topgenes_data)= sapply(strsplit(colnames(topgenes_data),split=" "),function(x) x[1])
colnames(topgenes_data)=paste(colnames(topgenes_data),targets$Batch[1:6])
d3heatmap(topgenes_data, scale = "row", dendrogram = "none",
    color = "YlOrRd",cexRow=1,main="topgenes")

genes_data=v1$E[rownames(v1$E) %in% genes,]
colnames(genes_data)=targets$Phenotype[match(colnames(genes_data),targets$bam)]
colnames(genes_data)= sapply(strsplit(colnames(genes_data),split=" "),function(x) x[1])
d3heatmap(genes_data, scale = "row", dendrogram = "none", main="title",
    color = "YlOrRd",cexRow=1)

genes_data=w1$E[rownames(w1$E) %in% genes,]
colnames(genes_data)=targets$Phenotype[match(colnames(genes_data),targets$bam)]
colnames(genes_data)= sapply(strsplit(colnames(genes_data),split=" "),function(x) x[1])
d3heatmap(genes_data, scale = "row", dendrogram = "none",
    color = "YlOrRd",cexRow=1)

Barchart: Edar Group Before and after Batch Removal

vdf=cbind(v1$E,w1$E)
genes_count=vdf[rownames(vdf) %in% genes,]

for (i in 1:length(genes)){
gbar=vdf[rownames(vdf)==genes[i],]
if(length(gbar)>0){
names(gbar)=targets$Phenotype[match(names(gbar),targets$bam)]
names(gbar)= sapply(strsplit(names(gbar),split=" "),function(x) x[1])
names(gbar)[7:12]=names(gbar)[1:6]
groupcol=as.factor(names(gbar))
colors = c("green", "orange", "green", "violet", "orange", "red", "pink", "cyan") 
barplot(2^gbar,col=colors[as.numeric(groupcol)], las = 2,main=paste(genes[i]," Counts: Normalized",sep=""),ylab="normalized cpm")
}
}

PCA Plot: After Batch Removal

edf=as.matrix(w1$E)
tedf= t(edf)
pca=prcomp(tedf,scale.=T)
tedf1 = data.frame(tedf)

Phenotype=sapply(strsplit(targets$Phenotype[1:6],split=" "),function(x) x[1])
Batch=targets$Batch[1:6]
tedf1$group = as.factor(Phenotype)
plot(pca,type="lines")  #Decide how many PC's are relevant for plotting

pca$x[,1:3]  #look at first 3 PC's
##                  PC1    PC2    PC3
## Sample_e10.bam -76.7  55.90  41.53
## Sample_e1.bam  -94.5 -24.81 -87.94
## Sample_e2.bam   54.6  96.85  -8.42
## Sample_e4.bam  -15.9 -73.78  89.63
## Sample_e5.bam   51.4   4.26  14.24
## Sample_e9.bam   81.0 -58.42 -49.03
plot3d(pca$x[,1:3],col = as.integer(tedf1$group),type="s",size=2)
group.v<-as.vector(paste(Phenotype,Batch))
text3d(pca$x, pca$y, pca$z, group.v, cex=1.0, adj = 1.2) 
#rgl.postscript("pca3d_indiv_edar_batch.pdf","pdf")

unnamed_chunk_17snapshot
You must enable Javascript to view this page properly.

finalres = cbind(w1$E,FC,finalres)
datadir="/Users/maggiec/Github/Maggie/ccbr620/Results"
setwd(datadir)
write.table(finalres,file="Gencode_FCpval_edar.txt",
      row.names=TRUE,col.names=TRUE,sep="\t",quote=FALSE)

Statistical Analysis of Fgf group

design2 <- model.matrix(~celltype)
fit2 <- lmFit(v2,design2) 
fit2 <- eBayes(fit2) 
options(digits=3) 
topgenes2=topTable(fit2,coef=2,n=50,sort.by="p")
finalres2=topTable(fit2,coef=2,sort="none",n=Inf)

FC2 = 2^(fit2$coefficients[,2])
FC2 = ifelse(FC2<1,-1/FC2,FC2)
finalres2 = cbind(v2$E,FC2,finalres2)
datadir="/Users/maggiec/GitHub/Maggie/ccbr620/Results"
setwd(datadir)
write.table(finalres2,file="Gencode_FCpval_Fgf.txt",
      row.names=TRUE,col.names=TRUE,sep="\t",quote=FALSE)

Volcano Plot: Fgf Group

suppressPackageStartupMessages(library(googleVis))
library(googleVis)

op <- options(gvis.plot.tag='chart')
volcano_data=as.data.frame(cbind(fit2$coefficients[,2],-1*log10(fit2$p.value[,2])))
volcano_data$pop.html.tooltip=rownames(volcano_data)
yval=ceiling(max(fit2$coefficients[,2]))
Scatter <- gvisScatterChart(volcano_data, 
          options=list(
          tooltip="{isHtml:'True'}",
          legend='none',
          lineWidth=0, pointSize=1,
          title='Fgf mut vs het', vAxis="{title:'Log Odds'}",
          hAxis="{title:'Log Fold Change'}", 
          width=1200, height=800,
          hAxes="[{viewWindowMode:'explicit', viewWindow:{min:-10, max:10}}]",
          vAxes="[{viewWindowMode:'explicit', viewWindow:{min:0, max:6}}]"))
plot(Scatter)

Heatmap: Fgf Group

library(d3heatmap)
#Original Heatmap (notice batch effect)
w2=v2
w2$E=v2$E[,c(1,2,4,3,5,6)]
topgenes_data2=w2$E[rownames(w2$E) %in% rownames(topgenes2),]
topgenes_data2=topgenes_data2[match(rownames(topgenes2),rownames(topgenes_data2)),]
colnames(topgenes_data2)=targets$Phenotype[match(colnames(topgenes_data2),targets$bam)]
colnames(topgenes_data2)= sapply(strsplit(colnames(topgenes_data2),split=" "),function(x) x[1])
colnames(topgenes_data2)=paste(colnames(topgenes_data2),targets$Batch[7:12])

d3heatmap(topgenes_data2, scale = "row", dendrogram = "none", main = "title",
    color = "YlOrRd",cexRow=1,main="topgenes")

genes_data=w2$E[rownames(w2$E) %in% genes,]
colnames(genes_data)=targets$Phenotype[match(colnames(genes_data),targets$bam)]
colnames(genes_data)= sapply(strsplit(colnames(genes_data),split=" "),function(x) x[1])
d3heatmap(genes_data, scale = "row", dendrogram = "none", main="title",
    color = "YlOrRd",cexRow=1)

Boxplots: Fgf Group

vdf=w2$E
for (i in 1:length(genes)){
gbar=vdf[rownames(vdf)==genes[i],]
if(length(gbar)>0){
names(gbar)=targets$Phenotype[match(names(gbar),targets$bam)]
names(gbar)= sapply(strsplit(names(gbar),split=" "),function(x) x[1])
groupcol=as.factor(names(gbar))
colors = c("blue", "yellow", "green", "violet", "orange", "red", "pink", "cyan") 
barplot(2^gbar,col=colors[as.numeric(groupcol)], las = 2,main=paste(genes[i]," Counts: Normalized",sep=""),ylab="normalized cpm")
}
else {
  paste(genes," is not present")
}
}

Provenance:

## R version 3.2.1 (2015-06-18)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.10.4 (Yosemite)
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] d3heatmap_0.6.1      googleVis_0.5.10     reshape_0.8.5       
##  [4] knitr_1.11           rgl_0.95.1367        ggplot2_1.0.1       
##  [7] edgeR_3.12.0         limma_3.26.0         Rsubread_1.20.0     
## [10] BiocInstaller_1.20.0
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.1        magrittr_1.5       MASS_7.3-44       
##  [4] munsell_0.4.2      colorspace_1.2-6   stringr_1.0.0     
##  [7] plyr_1.8.3         tools_3.2.1        grid_3.2.1        
## [10] gtable_0.1.2       png_0.1-7          htmltools_0.2.6   
## [13] yaml_2.1.13        digest_0.6.8       RJSONIO_1.3-0     
## [16] RColorBrewer_1.1-2 reshape2_1.4.1     formatR_1.2.1     
## [19] htmlwidgets_0.5    base64enc_0.1-3    evaluate_0.8      
## [22] rmarkdown_0.8.1    labeling_0.3       stringi_0.5-5     
## [25] scales_0.3.0       jsonlite_0.9.17    proto_0.3-10